The Dataset

Raw dataset: COVID-19 infection and death statistics from U.S. counties (sourced from NYT), combined with economic, education, and population data (sourced from various government agencies) and also survey responses about mask-wearing frequencies (sourced from NYT). 3141 complete observations on 19 metric variables and 6 categorical variables. To avoid any outliers due to population size differences between counties, all variables are scaled as a percentage of population. Variable descriptions can be found here.

Dataset for this pset: Adapted from the dataset described above, the dataset used in this pset has the aggregated Rural-Urban code (Rural_Urban_Code_2013) as the categorical variable. The five continuous variables used are cumulative COVID-19 cases as percent of total county population (Covid_Confirmed_Cases_as_pct), median household income (Median_Household_Income_2019), unemployment rate (Unemployment_Rate_2019), percent poverty (Percent_Poverty_2019), and percent of adults with less than a high school degree (Percent_Adults_Less_Than_HS). The unique county identifier (FIPS) is also contained in the dataset.

Note that by the end of number 1, the dataset is simplified to include Rural_Urban_Code_2013 as the categorical variable and three continuous variables: logMedian_Household_Income_2019, logPercent_Poverty_2019, and logCovid_Confirmed_Cases_as_pct.

The Rural-Urban Codes are numbered 1-9 according to the following descriptions provided by the USDA.

We will regroup codes 1 through 9 into three groups: (1) “Urban” for codes 1-3, (2) “Suburban” for codes 4-6, and (3) “Rural” for codes 7-9.

raw
## function (length = 0L) 
## .Internal(vector("raw", length))
## <bytecode: 0x7fa82feef8d8>
## <environment: namespace:base>
# Load the raw county-level dataset and keep only the pertinent columns.
url_data <- "https://evancollins.com/covid_and_demographics.csv"
raw <- read.csv(url(url_data))
raw <- as.data.frame(raw)
# Positional columns in the raw file: FIPS, rural-urban code, income,
# unemployment, poverty, education, and COVID cases as percent of population.
db <- subset(raw, select = c(4, 20, 10, 11, 13, 14, 22))

# Regroup Rural-Urban Codes 1-9 into three labels:
# codes 1-3 = "Urban", 4-6 = "Suburban", 7-9 = "Rural".
# Vectorized lookup replaces nine separate replacement statements; codes
# outside 1-9 (if any) are left untouched, matching the original behavior.
code_labels <- rep(c("Urban", "Suburban", "Rural"), each = 3)
in_range <- db$Rural_Urban_Code_2013 %in% 1:9
db$Rural_Urban_Code_2013[in_range] <- code_labels[as.numeric(db$Rural_Urban_Code_2013[in_range])]

1 Evaluate assumptions

One of the first things is to compare the continuous variables between groups.

# Compare each continuous variable between rural-urban groups.
# Fixed: the first title said "Death" although the variable plotted is
# confirmed CASES; the second title had a typo ("Incole").
boxplot(Covid_Confirmed_Cases_as_pct ~ Rural_Urban_Code_2013, data = db, horizontal = TRUE, main = "Cumulative Percent COVID-19 Cases by Rural-Urban Group")

boxplot(Median_Household_Income_2019 ~ Rural_Urban_Code_2013, data = db, horizontal = TRUE, main = "Median Household Income by Rural-Urban Group")

boxplot(Unemployment_Rate_2019 ~ Rural_Urban_Code_2013, data = db, horizontal = TRUE, main = "Unemployment Rate by Rural-Urban Group")

boxplot(Percent_Poverty_2019 ~ Rural_Urban_Code_2013, data = db, horizontal = TRUE, main = "Percent Poverty by Rural-Urban Group")

boxplot(Percent_Adults_Less_Than_HS ~ Rural_Urban_Code_2013, data = db, horizontal = TRUE, main = "Percent Adults Less than HS Education by Rural-Urban Group")

Next, we should check the assumption of multivariate normality in each group.

# Check multivariate normality separately within EACH rural-urban group
# using chi-square quantile plots; the CSQPlot helper is sourced from an
# online script.
source("http://www.reuningscherer.net/multivariate/R/CSQPlot.r.txt")

# Columns 3:7 hold the five continuous variables; one plot per group.
# par(mfrow = c(1,3), pty = "s", cex = 0.8)
CSQPlot(db[db$Rural_Urban_Code_2013 == "Urban", 3:7], label = "Urban")

CSQPlot(db[db$Rural_Urban_Code_2013 == "Suburban", 3:7], label = "Suburban")

CSQPlot(db[db$Rural_Urban_Code_2013 == "Rural", 3:7], label = "Rural")

In general, most data points for each rural-urban group lie within the 95% confidence limits in the above plots, but enough points deviate to call multivariate normality into question. Log transforms and square root transforms of all continuous variables were attempted, but these resulted in even greater deviation from the 95% confidence limits. Let’s dig deeper into how to address this lack of multivariate normality.

# Create sqrt- and log-transformed versions of the five continuous
# variables. Downstream code indexes the new columns by position
# (sqrt = 8:12, log = 13:17), so all sqrt columns are added first and the
# log columns second, in the same variable order as before.
cont_vars <- c("Unemployment_Rate_2019", "Median_Household_Income_2019",
               "Percent_Poverty_2019", "Percent_Adults_Less_Than_HS",
               "Covid_Confirmed_Cases_as_pct")
for (v in cont_vars) {
  db[[paste0("sqrt", v)]] <- sqrt(db[[v]])
}
for (v in cont_vars) {
  # small offset guards against log(0)
  db[[paste0("log", v)]] <- log(db[[v]] + 0.0001)
}

# Re-examine multivariate normality within each rural-urban group using the
# transformed variables. These calls are kept commented out; the plots were
# inspected once and showed greater deviation than the untransformed data.
# sqrt-transformed variables (columns 8:12):
#CSQPlot(db[db$Rural_Urban_Code_2013 == "Urban", 8:12], label = "Urban")
#CSQPlot(db[db$Rural_Urban_Code_2013 == "Suburban", 8:12], label = "Suburban")
#CSQPlot(db[db$Rural_Urban_Code_2013 == "Rural", 8:12], label = "Rural")

# log-transformed variables (columns 13:17):
#CSQPlot(db[db$Rural_Urban_Code_2013 == "Urban", 13:17], label = "Urban")
#CSQPlot(db[db$Rural_Urban_Code_2013 == "Suburban", 13:17], label = "Suburban")
#CSQPlot(db[db$Rural_Urban_Code_2013 == "Rural", 13:17], label = "Rural")

We checked normal quantile plots for each log variable. All were roughly linear (and hence normal) with one notable exception: logCovid_Confirmed_Cases_as_pct.

# Normal quantile plots for each log-transformed variable; only the COVID
# cases variable showed clear non-normality, so the others stay commented.
#qqnorm(db$logUnemployment_Rate_2019)
#qqnorm(db$logMedian_Household_Income_2019)
#qqnorm(db$logPercent_Poverty_2019)
#qqnorm(db$logPercent_Adults_Less_Than_HS)
qqnorm(db$logCovid_Confirmed_Cases_as_pct)

Let’s try omitting outliers. The outlier exclusion method is to exclude any logCovid_Confirmed_Cases_as_pct values that are more than 1.5 x IQR lower than our first quartile or more than 1.5 x IQR above the third quartile.

# Keep rows whose logCovid_Confirmed_Cases_as_pct lies strictly inside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
covid_log <- db$logCovid_Confirmed_Cases_as_pct
lower_lim <- quantile(covid_log, .25) - 1.5 * IQR(covid_log)
upper_lim <- quantile(covid_log, .75) + 1.5 * IQR(covid_log)
db_clean <- db[which(covid_log > lower_lim & covid_log < upper_lim), ]

# Report the proportion of rows REMOVED by this restriction
(nrow(db) - nrow(db_clean)) / nrow(db)
## [1] 0.055078

This outlier exclusion method removed about 5% of items, which is somewhat high. The outlier exclusion method of 3 x IQR only removed 2% of items, but ultimately yielded minimal improvements to our chi-square quantile plots.

Now, with our COVID cases outlier-clean dataset, let’s check the normal quantile plot of logCovid_Confirmed_Cases_as_pct.

qqnorm(db_clean$logCovid_Confirmed_Cases_as_pct)

This assumes a much more normal pattern.

Now, with the COVID cases outlier-clean dataset, we should check the assumption of multivariate normality in each group. We will use the log-transformed continuous variables.

# Re-source the chi-square quantile plot helper and check multivariate
# normality of the log variables (columns 13:17) within each group, now on
# the COVID-outlier-cleaned data.
source("http://www.reuningscherer.net/multivariate/R/CSQPlot.r.txt")

# One chi-square quantile plot per rural-urban group.
# par(mfrow = c(1,3), pty = "s", cex = 0.8)
CSQPlot(db_clean[db_clean$Rural_Urban_Code_2013 == "Urban", 13:17], label = "Urban Counties")

CSQPlot(db_clean[db_clean$Rural_Urban_Code_2013 == "Suburban", 13:17], label = "Suburban Counties")

CSQPlot(db_clean[db_clean$Rural_Urban_Code_2013 == "Rural", 13:17], label = "Rural Counties")

Still, it seems that the data does not assume multivariate normality.

Let’s try applying the 1.5 x IQR outlier exclusion method to the other four log-transformed variables. Although their respective qqnorm plots above did not demonstrate clear issues, there were minor outliers that could be affecting multivariate normality.

# Apply the same 1.5 x IQR outlier filter to the remaining four log
# variables, filtering the running dataset each time.
# Fixed: the poverty filter's upper bound previously used
# IQR(db_clean$logPercent_Poverty_2019) — the IQR of the *pre-filter*
# dataset — instead of db_clean_all, inconsistent with its other bounds.
trim_iqr_outliers <- function(d, col, k = 1.5) {
  # Keep rows of `d` whose value in `col` lies strictly inside
  # [Q1 - k*IQR, Q3 + k*IQR], with quartiles computed on `d` itself.
  x <- d[[col]]
  lower <- quantile(x, .25) - k * IQR(x)
  upper <- quantile(x, .75) + k * IQR(x)
  d[which(x > lower & x < upper), ]
}

db_clean_all <- trim_iqr_outliers(db_clean, "logUnemployment_Rate_2019")
db_clean_all <- trim_iqr_outliers(db_clean_all, "logMedian_Household_Income_2019")
db_clean_all <- trim_iqr_outliers(db_clean_all, "logPercent_Poverty_2019")
db_clean_all <- trim_iqr_outliers(db_clean_all, "logPercent_Adults_Less_Than_HS")

Now, with the all univariate outliers removed across all continuous variables, we should check the assumption of multivariate normality in each group. We will use the log-transformed continuous variables.

# With univariate outliers removed from all five log variables, re-check
# multivariate normality within each rural-urban group.
source("http://www.reuningscherer.net/multivariate/R/CSQPlot.r.txt")

# Chi-square quantile plots of the log variables (columns 13:17).
# par(mfrow = c(1,3), pty = "s", cex = 0.8)
CSQPlot(db_clean_all[db_clean_all$Rural_Urban_Code_2013 == "Urban", 13:17], label = "Urban Counties")

CSQPlot(db_clean_all[db_clean_all$Rural_Urban_Code_2013 == "Suburban", 13:17], label = "Suburban Counties")

CSQPlot(db_clean_all[db_clean_all$Rural_Urban_Code_2013 == "Rural", 13:17], label = "Rural Counties")

The above transformations have enabled the data to assume a more multivariate normal distribution. However, more than 5% of data still lies outside the 95% confidence limits.

Let us try to reduce the number of continuous variable columns in our dataset to achieve better multivariate normality.

# Try a reduced variable set. Columns 14, 15, 17 are
# logMedian_Household_Income_2019, logPercent_Poverty_2019, and
# logCovid_Confirmed_Cases_as_pct.
source("http://www.reuningscherer.net/multivariate/R/CSQPlot.r.txt")

# Chi-square quantile plots per group for the three retained variables.
# par(mfrow = c(1,3), pty = "s", cex = 0.8)
CSQPlot(db_clean_all[db_clean_all$Rural_Urban_Code_2013 == "Urban", c(14,15,17)], label = "Urban Counties")

CSQPlot(db_clean_all[db_clean_all$Rural_Urban_Code_2013 == "Suburban", c(14,15,17)], label = "Suburban Counties")

CSQPlot(db_clean_all[db_clean_all$Rural_Urban_Code_2013 == "Rural", c(14,15,17)], label = "Rural Counties")

This conforms with multivariate normality within each group.

Hence, for the remainder of this pset, the dataset analyzed db_final will include rural urban code as the categorical variable and three continuous variables: logMedian_Household_Income_2019, logPercent_Poverty_2019, and logCovid_Confirmed_Cases_as_pct.

# Final dataset: FIPS, rural-urban group label, and the three log variables.
db_final <- db_clean_all[, c(1, 2, 14, 15, 17)]

# Add a numeric version of the group label:
# 1 = Urban, 2 = Suburban, 3 = Rural
# match() maps each label to its position in the lookup vector, replacing
# three sequential string replacements followed by as.numeric().
db_final$Rural_Urban_Code_2013_Number <- as.numeric(
  match(db_final$Rural_Urban_Code_2013, c("Urban", "Suburban", "Rural"))
)

head(db_final)
##   FIPS Rural_Urban_Code_2013 logMedian_Household_Income_2019
## 1 1001                 Urban                        10.97221
## 2 1003                 Urban                        10.99995
## 3 1005              Suburban                        10.49050
## 4 1007                 Urban                        10.77725
## 5 1009                 Urban                        10.87620
## 6 1011              Suburban                        10.37055
##   logPercent_Poverty_2019 logCovid_Confirmed_Cases_as_pct
## 1                2.493214                       -2.245419
## 2                2.312545                       -2.471903
## 3                3.299537                       -2.502412
## 4                3.010626                       -2.248337
## 5                2.791171                       -2.276608
## 6                3.401201                       -2.187757
##   Rural_Urban_Code_2013_Number
## 1                            1
## 2                            1
## 3                            2
## 4                            1
## 5                            1
## 6                            2

It’s helpful to view where the groups live relative to each other, two variables at a time.

# Matrix scatterplot of the three log variables; color and plotting symbol
# are driven by the numeric group code in column 6 (offsets pick distinct
# colors/symbols per group).
plot(db_final[,3:5], col = db_final[,6]+3, pch = db_final[,6]+15, cex = 1.2)

It seems like the covariance ‘footprints’ are not the same between the urban/suburban/rural groups for these three continuous variables.

names(db_final)
## [1] "FIPS"                            "Rural_Urban_Code_2013"          
## [3] "logMedian_Household_Income_2019" "logPercent_Poverty_2019"        
## [5] "logCovid_Confirmed_Cases_as_pct" "Rural_Urban_Code_2013_Number"
# Visually compare the sample covariance matrices across the three groups
# (printed one group at a time below).
print("Covariance Matrix for Urban Counties")
## [1] "Covariance Matrix for Urban Counties"
cov(db_final[db_final$Rural_Urban_Code_2013=="Urban", 3:5])
##                                 logMedian_Household_Income_2019
## logMedian_Household_Income_2019                      0.04126981
## logPercent_Poverty_2019                             -0.06142845
## logCovid_Confirmed_Cases_as_pct                     -0.01007017
##                                 logPercent_Poverty_2019
## logMedian_Household_Income_2019             -0.06142845
## logPercent_Poverty_2019                      0.12503923
## logCovid_Confirmed_Cases_as_pct              0.01422176
##                                 logCovid_Confirmed_Cases_as_pct
## logMedian_Household_Income_2019                     -0.01007017
## logPercent_Poverty_2019                              0.01422176
## logCovid_Confirmed_Cases_as_pct                      0.06893112
print("Covariance Matrix for Suburban Counties")
## [1] "Covariance Matrix for Suburban Counties"
cov(db_final[db_final$Rural_Urban_Code_2013=="Suburban", 3:5])
##                                 logMedian_Household_Income_2019
## logMedian_Household_Income_2019                     0.031210862
## logPercent_Poverty_2019                            -0.053551499
## logCovid_Confirmed_Cases_as_pct                    -0.007670521
##                                 logPercent_Poverty_2019
## logMedian_Household_Income_2019              -0.0535515
## logPercent_Poverty_2019                       0.1142988
## logCovid_Confirmed_Cases_as_pct               0.0102640
##                                 logCovid_Confirmed_Cases_as_pct
## logMedian_Household_Income_2019                    -0.007670521
## logPercent_Poverty_2019                             0.010264005
## logCovid_Confirmed_Cases_as_pct                     0.077835876
print("Covariance Matrix for Rural Counties")
## [1] "Covariance Matrix for Rural Counties"
cov(db_final[db_final$Rural_Urban_Code_2013=="Rural", 3:5])
##                                 logMedian_Household_Income_2019
## logMedian_Household_Income_2019                     0.038748223
## logPercent_Poverty_2019                            -0.060478379
## logCovid_Confirmed_Cases_as_pct                     0.003524888
##                                 logPercent_Poverty_2019
## logMedian_Household_Income_2019           -0.0604783788
## logPercent_Poverty_2019                    0.1216473946
## logCovid_Confirmed_Cases_as_pct            0.0004314932
##                                 logCovid_Confirmed_Cases_as_pct
## logMedian_Household_Income_2019                    0.0035248880
## logPercent_Poverty_2019                            0.0004314932
## logCovid_Confirmed_Cases_as_pct                    0.1049023613
# Compare standard deviations in each group: aggregate() computes the
# variance of each log variable by group (column 6), sqrt() converts
# variance to standard deviation, and [,-1] drops the grouping column
# (whose sqrt is meaningless and discarded).
sumstats <- round(sqrt(aggregate(db_final[,3:5], by = list(db_final[,6]),FUN=var)),2)[,-1]
rownames(sumstats) <- c("Urban","Suburban", "Rural")
print("Standard Deviations by Group")
## [1] "Standard Deviations by Group"
sumstats
##          logMedian_Household_Income_2019 logPercent_Poverty_2019
## Urban                               0.20                    0.35
## Suburban                            0.18                    0.34
## Rural                               0.20                    0.35
##          logCovid_Confirmed_Cases_as_pct
## Urban                               0.26
## Suburban                            0.28
## Rural                               0.32
# Box's M test for homogeneity of the group covariance matrices.
# NOTE(review): boxM() is not in base R — presumably loaded earlier from
# the biotools or heplots package; confirm the library() call.
boxM(db_final[,3:5], db_final$Rural_Urban_Code_2013)
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  db_final[, 3:5]
## Chi-Sq (approx.) = 189.51, df = 12, p-value < 2.2e-16

With a p-value significantly below the 0.05 threshold, it appears that the covariance matrices are NOT the same between groups, so we fit both linear DA and quadratic DA.

2 Discriminant Analysis

Linear Discriminant Analysis

# Fit linear discriminant analysis of the three log variables on the
# rural-urban group. NOTE(review): lda() comes from the MASS package;
# confirm it is loaded earlier in the document.
county.disc <- lda(db_final[, 3:5], grouping = db_final$Rural_Urban_Code_2013)
summary(county.disc)
##         Length Class  Mode     
## prior   3      -none- numeric  
## counts  3      -none- numeric  
## means   9      -none- numeric  
## scaling 6      -none- numeric  
## lev     3      -none- character
## svd     2      -none- numeric  
## N       1      -none- numeric  
## call    3      -none- call
# Get univariate and multivariate group comparisons via MANOVA
# (Wilks' lambda for the multivariate test).
county.manova <- manova(as.matrix(db_final[, 3:5]) ~ db_final$Rural_Urban_Code_2013)
summary.manova(county.manova, test = "Wilks")
##                                  Df   Wilks approx F num Df den Df    Pr(>F)
## db_final$Rural_Urban_Code_2013    2 0.74937    146.7      6   5672 < 2.2e-16
## Residuals                      2838                                         
##                                   
## db_final$Rural_Urban_Code_2013 ***
## Residuals                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary.aov(county.manova)
##  Response logMedian_Household_Income_2019 :
##                                  Df  Sum Sq Mean Sq F value    Pr(>F)    
## db_final$Rural_Urban_Code_2013    2  22.754 11.3771  304.09 < 2.2e-16 ***
## Residuals                      2838 106.178  0.0374                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response logPercent_Poverty_2019 :
##                                  Df Sum Sq Mean Sq F value    Pr(>F)    
## db_final$Rural_Urban_Code_2013    2  24.49 12.2468  101.47 < 2.2e-16 ***
## Residuals                      2838 342.52  0.1207                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response logCovid_Confirmed_Cases_as_pct :
##                                  Df  Sum Sq Mean Sq F value    Pr(>F)    
## db_final$Rural_Urban_Code_2013    2   4.697 2.34843   28.11 8.153e-13 ***
## Residuals                      2838 237.098 0.08354                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Need to ask about code box below;

for some reason county.disc$scaling is a 3x2 matrix; so forcing it into one column results in 6 rows, which can’t be matrix multiplied by db_final[, 3:5]

this is because the scaling array has LD1 and LD2 values (which is 2 total (#groups - 1))

i just considered the discriminant functions separately

# Discriminant scores: project the three log variables onto each linear
# discriminant direction. county.disc$scaling is a 3x2 matrix (one column
# per discriminant function, #groups - 1 = 2), so each column is used
# separately; drop = FALSE keeps the column as an n x 1 matrix for %*%.
X <- as.matrix(db_final[, 3:5])
scores1 <- X %*% county.disc$scaling[, 1, drop = FALSE]

boxplot(scores1 ~ db_final$Rural_Urban_Code_2013, lwd = 3, col = c("red","blue", "green"), horizontal = T, main = "Rural-Urban Discriminant Scores by Function (LD1)", ylab = "Function")

# Second discriminant direction (LD2).
scores2 <- X %*% county.disc$scaling[, 2, drop = FALSE]

boxplot(scores2 ~ db_final$Rural_Urban_Code_2013, lwd = 3, col = c("red","blue", "green"), horizontal = T, main = "Rural-Urban Discriminant Scores by Function (LD2)", ylab = "Function")

We can see that there is more evidence of differences between county type in the LD1 direction of discrimination.

Quadratic Disciminant Analysis

# Fit quadratic discriminant analysis (allows a separate covariance matrix
# per group). NOTE(review): qda() is from the MASS package.
countyQ.disc <- qda(db_final[,3:5], grouping = db_final$Rural_Urban_Code_2013)
summary(countyQ.disc)
##         Length Class  Mode     
## prior    3     -none- numeric  
## counts   3     -none- numeric  
## means    9     -none- numeric  
## scaling 27     -none- numeric  
## ldet     3     -none- numeric  
## lev      3     -none- character
## N        1     -none- numeric  
## call     3     -none- call
# Resubstitution (raw) confusion matrix for QDA:
# rows = true group, columns = predicted group.
ctrawQ <- table(db_final$Rural_Urban_Code_2013, predict(countyQ.disc)$class)
ctrawQ
##           
##            Rural Suburban Urban
##   Rural      525      265   153
##   Suburban   259      390   204
##   Urban      141      230   674
# Leave-one-out cross-validated QDA confusion matrix (CV = TRUE returns
# held-out class predictions rather than a fitted model object).
county.discCVQ <- qda(db_final[,3:5], grouping = db_final$Rural_Urban_Code_2013, CV = TRUE)
ctCVQ <- table(db_final$Rural_Urban_Code_2013, county.discCVQ$class)
ctCVQ
##           
##            Rural Suburban Urban
##   Rural      522      268   153
##   Suburban   262      385   206
##   Urban      141      230   674
# Total cross-validated proportion correctly classified (sum of the
# confusion-matrix diagonal as a proportion of all counties).
round(sum(diag(prop.table(ctCVQ))),2)
## [1] 0.56

Quadratic discriminant analysis leads to correct classifications in 56% of instances. This is slightly better than linear discriminant analysis, which led to correct classifications in 54% of instances.

Stepwise Discriminant Analysis

From the T-test results above, we suspect that all three continuous variables are significant in this classification. Nevertheless, let’s do stepwise discriminant analysis to verify their respective significances.

# Stepwise discriminant analysis (LDA), both directions, with
# leave-one-out cross-validation (fold = number of rows).
# NOTE(review): stepclass() is from the klaR package; confirm it is loaded.
(step1 <- stepclass(Rural_Urban_Code_2013 ~  logMedian_Household_Income_2019 + logPercent_Poverty_2019 + logCovid_Confirmed_Cases_as_pct, data = db_final, method = "lda", direction = "both", fold = nrow(db_final)))
##  `stepwise classification', using 2841-fold cross-validated correctness rate of method lda'.
## 2841 observations of 3 variables in 3 classes; direction: both
## stop criterion: improvement less than 5%.
## correctness rate: 0.47237;  in: "logMedian_Household_Income_2019";  variables (1): logMedian_Household_Income_2019 
## correctness rate: 0.53397;  in: "logPercent_Poverty_2019";  variables (2): logMedian_Household_Income_2019, logPercent_Poverty_2019 
## 
##  hr.elapsed min.elapsed sec.elapsed 
##       0.000       1.000       4.646
## method      : lda 
## final model : Rural_Urban_Code_2013 ~ logMedian_Household_Income_2019 + logPercent_Poverty_2019
## <environment: 0x7fa848b20c00>
## 
## correctness rate = 0.534
names(step1)
## [1] "call"                "method"              "start.variables"    
## [4] "process"             "model"               "result.pm"          
## [7] "runtime"             "performance.measure" "formula"
step1$result.pm
## crossval.rate      apparent 
##     0.5339669     0.4646251
# Stepwise discriminant analysis (QDA), both directions, with
# leave-one-out cross-validation.
(step3 <- stepclass(Rural_Urban_Code_2013 ~ logMedian_Household_Income_2019 + logPercent_Poverty_2019 + logCovid_Confirmed_Cases_as_pct, data = db_final, method = "qda", direction = 'both', fold = nrow(db_final)))
##  `stepwise classification', using 2841-fold cross-validated correctness rate of method qda'.
## 2841 observations of 3 variables in 3 classes; direction: both
## stop criterion: improvement less than 5%.
## correctness rate: 0.47166;  in: "logMedian_Household_Income_2019";  variables (1): logMedian_Household_Income_2019 
## correctness rate: 0.53889;  in: "logPercent_Poverty_2019";  variables (2): logMedian_Household_Income_2019, logPercent_Poverty_2019 
## 
##  hr.elapsed min.elapsed sec.elapsed 
##       0.000       0.000      35.896
## method      : qda 
## final model : Rural_Urban_Code_2013 ~ logMedian_Household_Income_2019 + logPercent_Poverty_2019
## <environment: 0x7fa83995e908>
## 
## correctness rate = 0.5389

For both stepwise LDA and QDA, the two significant variables are determined to be logMedian_Household_Income_2019 and logPercent_Poverty_2019.

# Plot classification results in the space spanned by the selected
# variables. NOTE(review): partimat() is from the klaR package.
# First, LDA - linear partition of the plane.
partimat(as.factor(Rural_Urban_Code_2013) ~ logMedian_Household_Income_2019 + logPercent_Poverty_2019, data = db_final, method = "lda")

# Second, QDA - quadratic partition of the plane.
partimat(as.factor(Rural_Urban_Code_2013) ~ logMedian_Household_Income_2019 + logPercent_Poverty_2019, data = db_final, method = "qda")

# With more than two variables, partimat plots all pairs - as expected,
# the income/poverty pair appears to separate the groups best.
partimat(as.factor(Rural_Urban_Code_2013) ~ logMedian_Household_Income_2019 + logPercent_Poverty_2019 + logCovid_Confirmed_Cases_as_pct, data = db_final, method = "qda")

Hence, stepwise discriminant analysis (both LDA and QDA) suggests building a model with logMedian_Household_Income_2019 and logPercent_Poverty_2019 as the significant continuous variables. COVID-19 cases as percent of total county population is not as significant a predictor as these other two variables.

3 Wilks’ Lambda Test

To determine if there is statistical evidence that the multivariate group means are different, use the MANOVA test.

# Wilks' lambda MANOVA test: are the multivariate group means equal?
county.manova <- manova(as.matrix(db_final[, 3:5]) ~ db_final$Rural_Urban_Code_2013)
summary.manova(county.manova, test = "Wilks")
##                                  Df   Wilks approx F num Df den Df    Pr(>F)
## db_final$Rural_Urban_Code_2013    2 0.74937    146.7      6   5672 < 2.2e-16
## Residuals                      2838                                         
##                                   
## db_final$Rural_Urban_Code_2013 ***
## Residuals                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Wilks’ lambda value is significant (p < 2.2e-16), so we conclude the multivariate means are not the same.

4 Discriminant Functions

5 Classification

Regular:

# Resubstitution (non-CV) LDA confusion matrix:
# rows = true group, columns = predicted group.
county.disc <- lda(db_final[,3:5], grouping = db_final$Rural_Urban_Code_2013, CV=FALSE)
#(ct <- table(db_final$Rural_Urban_Code_2013, predict(county.disc)$class))
(ctraw <- table(db_final$Rural_Urban_Code_2013, predict(county.disc)$class))
##           
##            Rural Suburban Urban
##   Rural      616      132   195
##   Suburban   402      191   260
##   Urban      205      114   726
# Total resubstitution proportion correctly classified.
round(sum(diag(prop.table(ctraw))), 2)
## [1] 0.54

Note that the discriminating ability of our functions is better at the extremes of urban and rural but worse at the in-between suburban. 65% of rural counties were assigned properly and 69% of urban counties were assigned properly, versus only 22% of suburban counties. From an intuitive perspective, this makes sense: cities are very different from rural areas, whereas the suburbs fall somewhere in-between. This also makes statistical sense just by looking at our sample sizes: there are 1045 urban counties, 943 rural counties, and 853 suburban counties; our function is better at discriminating the groups that were more represented in the data. Overall, we have an accuracy of 54%.

Cross-Validation:

# Leave-one-out cross-validated LDA confusion matrix.
county.discCV <- lda(db_final[,3:5], grouping = db_final$Rural_Urban_Code_2013, CV = TRUE)
(ctCV <- table(db_final$Rural_Urban_Code_2013, county.discCV$class))
##           
##            Rural Suburban Urban
##   Rural      616      132   195
##   Suburban   402      191   260
##   Urban      205      114   726
# Total cross-validated proportion correctly classified.
round(sum(diag(prop.table(ctCV))),2)
## [1] 0.54

6 Best Discriminants